main
## NULL
## # A tibble: 1 × 1
## .config
## <chr>
## 1 Preprocessor1_Model1
This document is a template workflow to show the machine learning development process. Included are the following tabs:
Data and features: A description of data, cleaning methodology and features
Tidymodels setup: A setup of model engines and feature engineering “recipes”
Tuning and Metrics: Executions summaries of model performance
Explain: Metrics to “explain” aspects of black box models
Map: A map to investigate whether there are any geographic patterns of error
These are the parameters used in this specific run:
# Call file to load data
# NOTE(review): `input_path` is assumed to be set in the report's setup
# chunk (not visible here) — confirm. Sourcing into knit_global() makes
# the loaded objects available to later chunks.
source(file.path(here(),input_path,"load_data.R"),echo = TRUE, local = knitr::knit_global())##
## > # Load data
## >
## > iris<-iris
##
## > vis_miss(iris,warn_large_data = F)
See data cleaning code below.
# Call file to clean data
# max.deparse.length raises the truncation limit for the echoed source.
source(file.path(here(),input_path,"clean_data.R"),echo = TRUE, local = knitr::knit_global(), max.deparse.length=1e3)##
## > df<-
## + iris %>% filter(Sepal.Length <7.1)
Below is the entire dataset (with all original columns), which you can feel free to filter and explore.
See https://custom-scripts.sentinel-hub.com/custom-scripts/sentinel/sentinel-2/ for definition of some of the different remote sensing indices.
Our Outcome variable is Sepal.Length .
Our potential predictor variables are Sepal.Width, Petal.Length, Petal.Width, Species .
Below is some exploratory data analysis showing i) a histogram of all numeric columns, and ii) the frequencies of all categorical columns in the data
# Exploratory data analysis: distributions of the predictors and outcome.
# Relies on `df`, `predict_vars`, `outcome_var` created by earlier chunks.
PlotNumericColumns(df%>%dplyr::select(predict_vars, outcome_var))
PlotCategoricalColumns(df%>%dplyr::select(predict_vars))
# Boxplot (x = year, y = total emissions) Facet by crop
# cor_df<-cor(df%>%dplyr::select(where(is.numeric))%>%as_tibble())
# corrplot(cor_df)Below is a correlation plot of all numeric variables
# Correlation matrix restricted to the numeric predictors plus the outcome.
cor_df<-cor(df%>%dplyr::select(predict_vars,outcome_var)%>%dplyr::select(where(is.numeric))%>%as_tibble())
corrplot(cor_df, method = 'color', order = 'alphabet', tl.cex = 0.5)Below is the relationship between each variable and the outcome variable
# One panel per predictor vs. the outcome (numeric and categorical versions).
PlotNumericColumnsVOutcome(df%>%dplyr::select(predict_vars,outcome_var), outcome_var)
PlotCategoricalColumnsVOutcome(df%>%dplyr::select(predict_vars,outcome_var), outcome_var)Below is all missing data.
To run these models, we will use the tidymodels framework. Each tab below represents a key step in developing a model. These steps will all come together to become a Workflowset that we are able to tune
To read more, see tidymodels.org.
Below is the code for how we split data into training and test. Note that we are also setting up 5-fold cross validation.
# browser()
# Split data into training and test set --------------------
# Creates `df_split`, `df_train`, `df_test` (see echoed output below);
# supports an optional grouped split via params$group_split_var.
source("scripts/training_test.R", echo = TRUE, local = knitr::knit_global())##
## > # Split data into training and test set --------------------
## > set.seed(123)
##
## > df_split <-
## + if(is.null(params$group_split_var)){
## + print("Non-Group Split")
## + initial_split(df,prop = params$split_pct, strata = params$ .... [TRUNCATED]
## [1] "Non-Group Split"
##
## > df_train <- training(df_split)
##
## > df_test <- testing(df_split)
# Split training set into k-fold cross validation for evaluation later
# Creates `df_folds` (see echoed output below), again honoring
# params$group_split_var for grouped CV.
source("scripts/folds.R",echo = TRUE, local = knitr::knit_global())##
## > # Split training set into k-fold cross validation for evaluation later
## > df_folds<-
## + if(is.null(params$group_split_var)){
## + print("Non-Group ..." ... [TRUNCATED]
## [1] "Non-Group CV"
Below are some visualizations that show how the test and training data differ
#Visualize training and test data
# Each call row-binds train/test with a `dataset` label column so the
# plotting helpers can overlay the two distributions.
PlotNumericColumns(df_train%>%mutate(dataset = "train")%>%bind_rows(df_test%>%mutate(dataset = "test"))%>%
dplyr::select(predict_vars, outcome_var, dataset),
fill_column = "dataset")
PlotCategoricalColumns(df_train%>%mutate(dataset = "train")%>%bind_rows(df_test%>%mutate(dataset = "test"))%>%
dplyr::select(predict_vars, dataset),
fill_column = "dataset")
# NOTE(review): extraction artifact — the next line fuses a commented-out
# scatter plot with the start of a PlotNumericColumnsVOutcome() call, so
# the three argument lines that follow are orphaned. Confirm against the
# original .Rmd.
# df %>% ggplot(aes(x = area_ac, y = total_co2eq_kg_ac, colour = region)) +geom_point()PlotNumericColumnsVOutcome(df_train%>%mutate(dataset = "train")%>%bind_rows(df_test%>%mutate(dataset = "test"))%>%
dplyr::select(predict_vars, outcome_var, dataset),
outcome = outcome_var,
colour_column = "dataset")
PlotCategoricalColumnsVOutcome(df_train%>%mutate(dataset = "train")%>%bind_rows(df_test%>%mutate(dataset = "test"))%>%
dplyr::select(predict_vars, dataset, outcome_var),
outcome = outcome_var,
fill_column = "dataset")Below are the specs (models) we are considering in our workflow.
##
## > rf_spec<-
## + rand_forest(trees = 1000,
## + #The best mtry from tuning appeared to be 15
## + mtry = 20,
## + min_ .... [TRUNCATED]
##
## > rf_spec
## Random Forest Model Specification (regression)
##
## Main Arguments:
## mtry = 20
## trees = 1000
## min_n = 2
##
## Engine-Specific Arguments:
## keep.inbag = F
## importance = impurity
## seed = 700
## replace = T
##
## Computational engine: ranger
##
##
## > rf_spec_randomForest<-
## + rand_forest(trees = 1000,
## + #The best mtry from tuning appeared to be 15
## + mtry = 10,
## + .... [TRUNCATED]
##
## > rf_spec_randomForest
## Random Forest Model Specification (regression)
##
## Main Arguments:
## mtry = 10
## trees = 1000
##
## Computational engine: randomForest
##
##
## > xgb_spec <-
## + boost_tree( tree_depth = 5,
## + trees = 400,
## + mode = "regression"
## + ) %>%
## + set_engine("xgboost")
##
## > xgb_spec
## Boosted Tree Model Specification (regression)
##
## Main Arguments:
## trees = 400
## tree_depth = 5
##
## Computational engine: xgboost
##
##
## > lm_spec <-
## + linear_reg()
##
## > lm_spec
## Linear Regression Model Specification (regression)
##
## Computational engine: lm
##
##
## > knn_spec <-
## + nearest_neighbor(neighbors = 8, weight_func = "triangular") %>%
## + set_engine("kknn") %>%
## + set_mode("regression")
##
## > knn_spec
## K-Nearest Neighbor Model Specification (regression)
##
## Main Arguments:
## neighbors = 8
## weight_func = triangular
##
## Computational engine: kknn
##
##
## > nnet_spec<-
## + mlp()%>%
## + set_mode("regression")%>%
## + set_engine("nnet")
##
## > nnet_spec
## Single Layer Neural Network Model Specification (regression)
##
## Computational engine: nnet
##
##
## > cubist_spec<-
## + cubist_rules(
## + mode = "regression",
## + committees = NULL,
## + neighbors = NULL,
## + max_rules = NULL,
## + engine = " ..." ... [TRUNCATED]
##
## > bart_spec<-
## + parsnip::bart(trees = 236,
## + prior_terminal_node_coef = 0.953,
## + prior_terminal_node_expo = 0.788
## + .... [TRUNCATED]
##
## > bart_spec
##
## Call:
## NULL
##
##
## > lasso_spec <- linear_reg(penalty = 0.1, mixture = 1)%>%
## + set_engine("glmnet")
##
## > lasso_spec
## Linear Regression Model Specification (regression)
##
## Main Arguments:
## penalty = 0.1
## mixture = 1
##
## Computational engine: glmnet
##
##
## > stan_spec <-
## + linear_reg() %>%
## + set_engine("stan")
##
## > stan_spec
## Linear Regression Model Specification (regression)
##
## Computational engine: stan
##
##
## > # Combine all specs into a list
## > model_list <-list("xgb" = xgb_spec,
## + "rf"= rf_spec,
## + "rfrandomForest" = rf_s .... [TRUNCATED]
We may be considering different sets of predictors and/or different feature engineering steps. We define those below.
##
## > FeatureEngineering<- function(rec){
## + rec%>%
## + step_mutate(across(where(is.logical), as.numeric)) %>%
## + # Remove categorical variables tha .... [TRUNCATED]
##
## > df_rec <-recipe(
## + Formulate(y = outcome_var, x= predict_vars),
## + data = df_train
## + )%>%
## + FeatureEngineering()
##
## > df_rec_width <-recipe(
## + Formulate(y = outcome_var, x= c("Sepal.Width","Petal.Width")),
## + data = df_train
## + )%>%
## + FeatureEngineering()
##
## > preproc_list<-list("main" = df_rec,
## + "mainwidth" = df_rec_width)
Recipes and Specs combine to create a workflowset, which is just an object that contains each model we will be training.
# Create workflowset
# Filters `model_list`/`preproc_list` to params$models_to_run and
# params$recipes_to_run, then crosses them into `df_wf_set`
# (see echoed output below).
source("scripts/create_workflowset.R",echo = TRUE, local = knitr::knit_global())##
## > #Remove models we're not interested in running
## > model_list<-model_list[names(model_list) %in% params$models_to_run]
##
## > #Remove recipes were not interested in
## > preproc_list<-preproc_list[names(preproc_list) %in% params$recipes_to_run]
##
## > # This is where we define what's a part of the workflow set
## > df_wf_set <-
## + workflow_set( # NOTE - ensure that preproc does not contain underscor .... [TRUNCATED]
In total there are 4 models, which we summarize below:
# Interactive summary table of every workflow in the set: the outcome and
# predictor roles are pulled from each workflow's recipe var_info, and the
# printed recipe/workflow output is embedded as HTML (hence the <br> tags
# and html = TRUE column defs).
df_wf_set%>%
rowwise()%>%
mutate(
predictors = paste(info$workflow[[1]]$pre$actions$recipe$recipe$var_info%>%filter(role == "predictor")%>%pull(variable), collapse = "<br>"),
outcome = paste(info$workflow[[1]]$pre$actions$recipe$recipe$var_info%>%filter(role == "outcome")%>%pull(variable), collapse = ","),
outcome_predictors = paste("<b>Outcome</b>",
outcome,
"<br><b>Predictors</b>",
predictors,
sep = "<br>"),
# [-(1:10)] drops the printed recipe header lines, keeping only the steps.
operations = paste(capture.output(info$workflow[[1]]$pre$actions$recipe$recipe)[-(1:10)], collapse = "<br><br>"),
workflow_output = paste(capture.output(info$workflow, type = "output"), collapse = "<br>"))%>%
dplyr::select(wflow_id, outcome_predictors:workflow_output)%>%
reactable(
columns = list(workflow_output = colDef(html = TRUE, width = 1000),
operations = colDef(html = TRUE),
outcome_predictors = colDef(html = TRUE, width = 250)
),
outlined = TRUE,
highlight = TRUE,
striped = TRUE,
showSortable = TRUE,
compact = FALSE,
bordered = TRUE,
defaultExpanded = F,
filterable = TRUE,
showPageSizeOptions = TRUE)Workflowsets are first tuned in the code snippet below:
# browser()
#Run workflow set through cross validation ----
# Creates `tuned_workflows` via workflow_map() over `df_wf_set`, running
# in parallel through registerDoParallel (see echoed output below).
source("scripts/tune_cv.R",echo = TRUE, local = knitr::knit_global())##
## > registerDoParallel(cores = parallel::detectCores(logical = FALSE))
##
## > grid_ctrl <-
## + control_grid(
## + save_pred = TRUE,
## + parallel_over = "everything",
## + save_workflow = FALSE
## + )
##
## > #Run workflow set through cross validation -----------------------
## > tuned_workflows<-workflow_map(
## + df_wf_set%>%
## + option_add(control = cont .... [TRUNCATED]
## Warning: There are existing options that are being modified
## main_xgb: 'control'
## main_rf: 'control'
## mainwidth_xgb: 'control'
## mainwidth_rf: 'control'
#t<-extract_workflow_set_result(tuned_workflows, "mainall_rf")
# autoplot(tuned_workflows)
# tuned_workflows[6,][["result"]][[1]][[".notes"]]
# Save tuned workflow to S3
# if (params$save_results){
# WriteToS3(df = tuned_workflows,
# s3name = paste0(params$run_name, "_tuned"),
# type = "RDS",
# obj = data_directory)
# }# Combine models with stacks
# https://stacks.tidymodels.org/articles/basics.html
#https://stacks.tidymodels.org/articles/workflowsets.html
# https://www.youtube.com/watch?v=HZXZxvdpDOg&t=2160s
# Build a stacked ensemble from the tuned workflow-set results.
df_wf_set_stack<-
#Initialize the stack object
stacks()%>%
add_candidates(tuned_workflows)%>%
# determine how to combine their predictions
# Look deeper into setting up a meta model
# include fewer models by proposing higher penalties
blend_predictions(penalty = 10^seq(-2, -0.5, length = 20))%>%
# fit the candidates with nonzero stacking coefficients
fit_members()
# Relevant when tuning:
# collect_parameters(df_wf_set_stack, "rf")
# TODO: Save stacked model to S3Tuned models are saved [HERE] on S3.
# Rank tuned workflows by CV performance; split wflow_id ("recipe_model")
# into separate `recipe` and `model` columns for plotting.
ranked_results<-
rank_results(tuned_workflows)%>%
tidyr::extract(wflow_id, c("recipe", "model"), "([^_]+)_(.*)")
# Mean CV metric per workflow with +/- one standard error, by rank.
ranked_results%>%
ggplot(aes(x = rank, y = mean, colour = model, shape = recipe))+
geom_point()+
geom_errorbar(aes(ymin = mean - std_err,
ymax = mean + std_err))+
scale_colour_iwanthue() +
scale_x_continuous(breaks= pretty_breaks())+
labs(y="")+
facet_wrap(~.metric, scales = "free")# Plot of predictions from each fold
# Predicted vs. actual for every CV fold, one panel per workflow.
collect_predictions(tuned_workflows, summarize = F)%>%
ggplot(aes(y = .pred, x = !!sym(outcome_var), colour = id))+
geom_point(alpha = 0.1)+
facet_wrap(~wflow_id)+
scale_colour_iwanthue()+
guides(colour = guide_legend(override.aes = list(alpha=1)))+
labs(colour = "")# V2
# collect_predictions(tuned_workflows, summarize = F)%>%
# ggplot(aes(y = .pred, x = !!sym(outcome_var), colour = id))+
# geom_point(alpha = 0.1)+
# facet_grid(wflow_id~id)+
# scale_colour_iwanthue()+
# guides(colour = guide_legend(override.aes = list(alpha=1)))
FormatReactable(rank_results(tuned_workflows))After each model is trained, we can generate an ensemble model via stacking:
# Inspect the fitted stack (blended coefficients / member weights).
print(df_wf_set_stack)
autoplot(df_wf_set_stack)
# autoplot(df_wf_set_stack, type = "weights")
# Stacked-ensemble predictions on the test set, with signed error.
ensemble_preds<-
df_test%>%
mutate(predict(df_wf_set_stack, .))%>%
mutate(ensemble_error = !!sym(outcome_var) - .pred)
member_preds_test <-
df_test %>%
dplyr::select(!!sym(outcome_var))%>%
# Include each member model's predictions as additional columns
bind_cols(predict(df_wf_set_stack, df_test, members = TRUE))
# Member predictions compared to ensemble
# RMSE of every member column (".pred" is the ensemble itself) on test data.
member_stats_test<-
map_dfr(colnames(member_preds_test), rmse, truth = !!sym(outcome_var), data = member_preds_test) %>%
mutate(member = colnames(member_preds_test))
# On training data
member_preds_train<-
df_train %>%
dplyr::select(!!sym(outcome_var))%>%
# Include each member model's predictions as additional columns
bind_cols(predict(df_wf_set_stack, df_train, members = TRUE))
member_stats_train<-
map_dfr(colnames(member_preds_train), rmse, truth = !!sym(outcome_var), data = member_preds_train) %>%
mutate(member = colnames(member_preds_train))
# Actual vs. predicted for the ensemble on the test set, annotated with RMSE.
ensemble_preds%>%
ggplot(aes(x = !!sym(outcome_var), y = .pred))+
geom_point()+
geom_abline(linetype = "dashed",
color = "gray")+
labs(x = "actual", y = "predicted", title = "Stacking Test Actual vs Predicted", subtitle= paste0("(RMSE: ", round(member_stats_test%>%filter(member == ".pred")%>%pull(.estimate), 2), ")"))+
theme_bw()
FormatReactable(member_stats_test)# Tune individual workflows
# Tune each workflow individually via TuneExtractModel, collecting fitted
# models and diagnostics into the `fits` list named by wflow_id
# (see echoed output below).
source("scripts/tune_workflows.R",echo = TRUE, local = knitr::knit_global())##
## > fits<- lapply(df_wf_set$wflow_id,
## + TuneExtractModel,
## + wf =if(params$n_folds > 1){tuned_workflows}else{df_wf_set},
## + .... [TRUNCATED]
## [1] "main_xgb"
## [1] "main_rf"
## → A | warning: 20 columns were requested but there were 5 predictors in the data. 5 will be used.
##
There were issues with some computations A: x1
There were issues with some computations A: x1
## [1] "mainwidth_xgb"
## [1] "mainwidth_rf"
## → A | warning: 20 columns were requested but there were 2 predictors in the data. 2 will be used.
##
There were issues with some computations A: x1
There were issues with some computations A: x1
##
## > names(fits) <- df_wf_set$wflow_id
#example use:
#fits[['main_rf']][[2]]
# browser()
# Optionally persist the chosen fitted model locally (S3 upload disabled).
# fits[[...]][[1]] is the fitted workflow by convention elsewhere in this
# report — TODO confirm against TuneExtractModel's return structure.
if (params$save_results){
saveRDS(fits[[params$model_to_save]][[1]], paste0("~/downloads/", params$run_name,"_", params$model_to_save,"_fit",".RDS"))
# WriteToS3(df = fits[[params$model_to_save]][[1]]%>%extract_fit_parsnip(),
# s3name = paste0(params$run_name,"_", params$model_to_save,"_fit"),
# type = "RDS",
# obj = data_directory)
}Below is a summary of RMSEs across the training data, Cross-validated model set, and test set for all 4 models plus the ensemble.
# Collect test/train RMSE per workflow from the `fits` list
# (elements [[3]] = test RMSE, [[4]] = train RMSE by convention here).
rmses<-
lapply(df_wf_set$wflow_id, function(e){
tibble(m = e,
test_rmse = fits[[e]][[3]],
train_rmse = fits[[e]][[4]]
)
})%>%
bind_rows()
# Join in the cross-validated RMSE when CV was actually run.
if(params$n_folds > 1){
rmses<-
rmses%>%
left_join(
ranked_results%>%
mutate(m = paste0(recipe,"_", model))%>%
filter(.metric == "rmse")%>%
dplyr::select(m, "cv_rmse" = mean),
by = "m"
)
}
# Append the stacked ensemble row (".pred" = ensemble) and rank by test RMSE.
if(params$run_stack){
rmses<-
rmses %>%
bind_rows(member_stats_test%>%
filter(member == ".pred")%>%
transmute(m = "ensemble", test_rmse = .estimate)%>%
left_join(member_stats_train%>%
filter(member == ".pred")%>%
transmute(m = "ensemble",
train_rmse = .estimate), by = "m")
)%>%
arrange(test_rmse)%>%
mutate(rank = 1:n())
}
FormatReactable(rmses)Below is a grid of how the errors across models are correlated
# Per-observation errors for every workflow, stacked long with a shared
# observation `id` so errors can be compared across models.
# fits[[e]][[8]] holds the per-observation prediction/error table.
error_crossmodel<-
lapply(df_wf_set$wflow_id, function(e){
fits[[e]][[8]]%>%
dplyr::select(.pred, error, model, !!sym(outcome_var))%>%
mutate(wflow_id = e,
id = 1:n())
})%>%
bind_rows()%>%
arrange(- !!sym(outcome_var))%>%
filter(model == "Test")
# Pairwise comparison of each model's errors against every other model's:
# MAE/RMSE of the error differences and the error correlation, then spread
# into a model-by-model correlation grid.
final<-
lapply(df_wf_set$wflow_id, function(e){
error_crossmodel%>%
mutate(m = e)%>%
group_by(id)%>%
mutate(val = error[wflow_id == e])%>%
group_by(m,wflow_id)%>%
summarise(mae = Metrics::mae(error,val),
rmse = Metrics::rmse(error,val),
cor = cor(error,val))
})%>%
bind_rows()%>%
dplyr::select(m, wflow_id, cor)%>%
spread(wflow_id, cor)%>%
arrange(m)
# Errors per observation vs. actual value; vertical segments span the
# min-to-max error across models for the same observation.
FormatReactable(final)gg1<-error_crossmodel%>%
group_by(id)%>%
mutate(max_error = max(error),
min_error = min(error))%>%
ggplot()+
geom_point(aes(x = !!sym(outcome_var), y = error, colour = wflow_id),
alpha = 0.5, size = 0.5)+
geom_segment(aes(y = max_error, yend= min_error,x = !!sym(outcome_var),xend=!!sym(outcome_var), group = id),
linewidth =0.01)+
geom_abline(slope = 0, colour = "red")+
facet_wrap(~model, nrow = 1, scales = "free_x")+
scale_colour_iwanthue()+
labs(x = "Actual Value")
# Version 2: same data, but x is the observation ordered by error size.
gg2<-error_crossmodel%>%
group_by(id)%>%
mutate(max_error = max(error),
min_error = min(error))%>%
ungroup()%>%
mutate(id = fct_reorder(as.character(id), -error))%>%
ggplot()+
geom_point(aes(x = id, y = error, colour = wflow_id),
alpha = 0.5, size = 0.5)+
geom_segment(aes(y = max_error, yend= min_error,x = id,xend= id, group = id),
linewidth =0.01)+
geom_abline(slope = 0, colour = "red")+
facet_wrap(~model, nrow = 1, scales = "free_x")+
scale_colour_iwanthue()+
theme(axis.text.x=element_blank()) +
labs(x = "Observation")
# Render the two error plots side by side; then define GenerateChunk():
# expands an Rmd child-chunk template for a given recipe name ({{nm}}) and
# model ({{m}}), printing selected elements of fits[["nm_m"]].
grid.arrange(gg1, gg2, nrow = 1)GenerateChunk<-
function(nm, m) {
template<-
c(
"### {{nm}}\n",
"```{r, echo = FALSE, fig.width=12, out.width='100%', dpi=300}\n",
#"fits[['{{nm}}_{{m}}']][[1]]%>%extract_fit_engine()%>%print()\n",
"fits[['{{nm}}_{{m}}']][[5]]\n",
"fits[['{{nm}}_{{m}}']][[7]]\n",
"fits[['{{nm}}_{{m}}']][[2]]\n",
"fits[['{{nm}}_{{m}}']][[9]]\n",
"fits[['{{nm}}_{{m}}']][[10]]\n",
"```\n",
"\n"
)
# knit_expand() substitutes {{nm}}/{{m}} from this function's environment.
return(knitr::knit_expand(text = template))
}
# Assemble the dynamic Rmd text: one tabset heading per model, containing
# one expanded GenerateChunk() section per recipe.
text<- c()
for (a in params$models_to_run){
text<-c(text, paste0("## ",a, "{.tabset}"))
# Split into all the different recipes
text<-
c(text,
lapply(params$recipes_to_run, GenerateChunk, m = a)%>%unlist()
)
}## NULL
## # A tibble: 1 × 1
## .config
## <chr>
## 1 Preprocessor1_Model1
## NULL
## # A tibble: 1 × 1
## .config
## <chr>
## 1 Preprocessor1_Model1
## NULL
## # A tibble: 1 × 1
## .config
## <chr>
## 1 Preprocessor1_Model1
## NULL
## # A tibble: 1 × 1
## .config
## <chr>
## 1 Preprocessor1_Model1
You can use this chunk to customize some results! If you don’t provide an “other_chunks/other_chunks.Rmd” in your directory, this tab won’t show up!
# This is how to run TuneExtractModel (more for testing)
fit_xgb<-TuneExtractModel(tuned_wf =tuned_workflows,
string_model = "main_xgb",
training_data = df_train,
testing_data = df_test)
# [[1]] = best fitted workflow, [[2]] = a plot/summary element, per the
# convention used elsewhere in this report.
fit_xgb_best<- fit_xgb[[1]]
fit_xgb[[2]]
xgb_preds <-
df_test %>%
bind_cols(predict(fit_xgb_best, df_test))
# Variable importance of the best xgb fit.
# NOTE(review): pull_workflow_fit() is deprecated in workflows — consider
# extract_fit_parsnip() when next touching this chunk.
grid.arrange(
fit_xgb_best%>%
pull_workflow_fit()%>%
vip(geom = "col")
)#Compare Models
# TODO: Turn into mini widget where you can compare the predictions of two models
# Join xgb and rf per-observation predictions by id_hash.
# NOTE(review): `rf_preds` is not defined anywhere in this document —
# confirm it is created by a sourced script, otherwise this chunk errors.
xgb_rf<-
xgb_preds%>%
rename("xgb_pred" = .pred)%>%
mutate(xgb_error = !!sym(outcome_var) - xgb_pred )%>%
left_join(rf_preds%>%dplyr::select(id_hash, "rf_pred" = .pred), by = "id_hash")
xgb_rf%>%
ggplot(aes(x = xgb_pred, y = rf_pred, colour = xgb_error))+
geom_point()+
geom_abline(linetype = "dashed",
color = "gray")+
scale_colour_distiller(type = "div")
# NOTE(review): `time_harvest_year` is not a column of the iris-based df
# used above — presumably a leftover from the original dataset; verify.
xgb_rf%>%
ggplot(aes(x = time_harvest_year, y = !!sym(outcome_var), colour = xgb_error))+
geom_point()+
geom_abline(linetype = "dashed",
color = "gray")+
scale_colour_distiller(type = "div")
#### Test Errors ---------------------
#Difference in test error
# What do all the ensemble observations look like compared to individual models?
error_df<-
xgb_rf%>%
mutate(rf_error = !!sym(outcome_var) - rf_pred)%>%
left_join(ensemble_preds%>%dplyr::select(id_hash, "ensemble_pred" = .pred, ensemble_error), by = "id_hash")%>%
arrange(-ensemble_error)%>%
# Set order of plot to descending ensemble error
mutate(id_hash = factor(id_hash, levels = id_hash))%>%
gather(var, val, contains("_error"))
# Error widget
GenerateErrorBars(error_df, "total_emissions")
GenerateErrorBars(error_df, "ipcc_climate")
# GenerateErrorBars(error_df, "crop")
# GenerateErrorBars(error_df, "rs_yield")
# Shiny widget for interactively exploring per-model error bars.
#
# @param dataset A long-format data frame of per-observation errors (as
#   built for `error_df` above); its column names populate the fill-variable
#   dropdown.
# @return A `shinyApp` object that renders `GenerateErrorBars()` through
#   plotly for the currently selected fill variable.
WidgetErrorBar <- function(dataset) {
  library(shiny)
  vars <- colnames(dataset)
  shinyApp(
    ui = fluidPage(
      fluidRow(style = "padding-bottom: 20px;",
        column(12, selectInput('fill_var', 'Fill Variable', vars, selected = "total_emissions"))
      ),
      fluidRow(
        plotlyOutput('error_bar_plot', height = "400px")
      )
    ),
    server = function(input, output, session) {
      output$error_bar_plot <- renderPlotly({
        # BUG FIX: previously plotted the global `error_df`, silently
        # ignoring the `dataset` argument; now plots the data passed in.
        ggplotly(GenerateErrorBars(dataset, input$fill_var))
      })
    },
    options = list(height = 500)
  )
}
# WidgetErrorBar(error_df)Below is a set of diagnostics aimed to make our models more
explainable. Note here that these are developed using the
DALEX package.
For more information on Explainable ML, see the following resources:
Explanatory Model Analysis (Dalex) by Przemyslaw Biecek and Tomasz Burzykowski
Interpretable Machine Learning by Christoph Molnar
Hands on Machine Learning by Bradley Boehmke
The following diagnostics are at the dataset (global) level, meaning they summarize average behavior at the highest level of the machine learning algorithm. These model-agnostic methods describe the average behavior of the model.
Note: All Global diagnostic plots from the DALEX package
below are based on plugging in the training data (as opposed to the test
data).
# Model performance
# fits[[e]][[6]] holds the DALEX explainer for each model by convention.
# NOTE(review): `plot()` is applied to a *list* of DALEX objects here;
# DALEX plot methods usually take multiple objects via `...` — confirm
# this dispatches as intended.
lapply(params$models_to_explain, function(e){
explainer<-fits[[e]][[6]]
model_performance(explainer)
})%>%
plot(geom = "histogram")
#plot(resids_rf, resids_xgb, geom = "boxplot")
# Residual diagnostics: residuals vs. the observed outcome.
lapply(params$models_to_explain, function(e){
explainer<-fits[[e]][[6]]
model_diagnostics(explainer)
})%>%
plot(variable = "y", yvariable = "residuals")## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# Dalex variable importance
# Permutation importance (B = 1 permutation) using RMSE loss.
v<-
lapply(params$models_to_explain, function(e){
explainer<-fits[[e]][[6]]
variable_importance(explainer, loss_function = loss_root_mean_square, B = 1)
})
# Extract variable names ordered by dropout loss, dropping the synthetic
# _full_model_ / _baseline_ rows.
v%>%plot(max_vars = 15)model_vars <- v%>%
bind_rows()%>%
group_by(label)%>%
filter(dropout_loss != dropout_loss[variable == "_full_model_"])%>%
filter(variable!="_baseline_")%>%
arrange(-dropout_loss)%>%
pull(variable)%>%
unique()
top_three_most_important_vars<-model_vars[1:3]
# single_variable(explainer_rf, variable = "rs_yield")Below is a set of plots defined below:
Source: https://christophm.github.io/interpretable-ml-book/ale.html#motivation-and-intuition
#PDP
# Partial dependence profiles for the important variables (N = 50 obs).
p1<-lapply(params$models_to_explain, function(e){
explainer<-fits[[e]][[6]]
model_profile(explainer, type = "partial",N = 50, variables = model_vars)
})%>%
plot()
# Local Dependence Profiles
p2<-lapply(params$models_to_explain, function(e){
explainer<-fits[[e]][[6]]
model_profile(explainer, type = "conditional", N = 50, variables = model_vars)
})%>%
plot()+
ggtitle("Local Dependence profile")
# Accumulated Dependence Profiles
p3<-lapply(params$models_to_explain, function(e){
explainer<-fits[[e]][[6]]
model_profile(explainer, type = "accumulated", N = 50, variables = model_vars)
})%>%
plot()
# Side-by-side layout — presumably via patchwork's `|` operator; confirm
# patchwork is attached by the report setup.
(p1 | p2 | p3)# All 3 together
#TODO: Use objects from previous
# Overlay all three profile types per model by relabeling `_label_`.
lapply(params$models_to_explain, function(e){
explainer<-fits[[e]][[6]]
a<-model_profile(explainer, type = "accumulated", N = 50, variables = model_vars)
a$agr_profiles$`_label_` = "accumulated local"
b<-model_profile(explainer, type = "conditional", N = 50, variables = model_vars)
b$agr_profiles$`_label_` = "local dependence"
c<-model_profile(explainer, type = "partial", N = 50, variables = model_vars)
c$agr_profiles$`_label_` = "partial dependence"
plot(a,b,c)+
ggtitle(paste0("Dependence Profiles (",e ,")"), subtitle = "")
})%>%
grid.arrange(grobs = ., nrow = 1)The following diagnostics are at the instance (local) level, meaning they are at the prediction level of the machine learning algorithm. These methods are useful to break down and understand model predictions for a given data point.
In our diagnostics below, we will use the following data point(s):
# Choose an observation of interest
# The first test row serves as the example instance for all local plots.
new_obs<-df_test%>%head(1)#%>%dplyr::select(predict_vars)
FormatReactable(new_obs)Note that the ordering of breakdown plots matters.
# Break down plot
# Note that the ordering of breakdown matters and these are additive.
# lapply(params$models_to_explain, function(e){
#
# explainer<-fits[[e]][[6]]
# predict_parts(explainer,
# new_observation = new_obs,
# type = "break_down")%>%
# plot()
# })%>%
# grid.arrange(grobs = .)
# Break down plot (with distributions)
# Contribution breakdown for `new_obs`, ordered by the top-3 important
# variables found above, with conditional distributions overlaid.
lapply(params$models_to_explain, function(e){
explainer<-fits[[e]][[6]]
predict_parts(explainer,
new_observation = new_obs,
type = "break_down",
keep_distributions = T,
order = top_three_most_important_vars)%>%
plot(plot_distributions = TRUE)
})%>%
grid.arrange(grobs = .)
# Break down interactions
# Ordering also really matters here.
# lapply(params$models_to_explain, function(e){
#
# explainer<-fits[[e]][[6]]
# predict_parts(explainer,
# new_observation = new_obs,
# type = "break_down_interactions",
# interaction_preference = 2)%>%
# plot()
# })%>%
# grid.arrange(grobs = .)
# Shapley
# shap_rf <- predict_parts(explainer_rf, new_observation = new_obs, type = "shap", B = 50)
# plot(shap_rf)# Ceteris Paribus Profile
# lapply(params$models_to_explain, function(e){
#
# explainer<-fits[[e]][[6]]
# predict_profile(explainer,
# new_observation = new_obs)
# })%>%
# plot(variables = c("rs_yield"),
# color = "_label_"
# )+
# ggtitle("Ceteris-paribus profile", "")
# plot(cp_rf, variables = c("hist_prevcrop"),
# variable_type = "categorical", categorical_type = "bars")
# ggtitle("Ceteris-paribus profile", "")
#Local stability
# Residual distribution among the 30 nearest neighbors of `new_obs`.
p1<-lapply(params$models_to_explain, function(e){
n<- 30
explainer<-fits[[e]][[6]]
predict_diagnostics(explainer,
new_obs,
neighbors = n)%>%
plot()+
ggtitle(paste0("Distribution of Residuals: ", e))
})%>%
grid.arrange(grobs = .)
# Variable specific stability
# Same neighborhood diagnostic, restricted to the single most important variable.
p2<-lapply(params$models_to_explain, function(e){
explainer<-fits[[e]][[6]]
predict_diagnostics(explainer,
new_obs,
neighbors = 30,
variables = top_three_most_important_vars[1])%>%
plot()
})%>%
grid.arrange(grobs = .)
# grid.arrange(p1, p2, nrow = 1)# Trying to put this on top of report:
# https://stackoverflow.com/questions/23570718/creating-summaries-at-the-top-of-a-knitr-report-that-use-variables-that-are-defi
# {r chunk-a, ref.label=c('tune_rf','param_table')}
# rmses%>%
# filter(rank == min(rank))%>%
# rename("best_model" = m)%>%
# FormatReactable(filt = F)